---
title: "CMS Text Analysis"
editor_options: null
output:
  pdf_document: default
  html_document: default
chunk_output_type: console
---
```{r, echo = FALSE, message = FALSE, warning = FALSE, error = FALSE, include = FALSE}
setwd("~/Dropbox/Projects/JPART Text Symposium")
```
  
## Preprocessing
  
# General Preprocessing

We first clear the workspace and load the relevant packages.

```{r}
rm(list=ls())
library(httr)
library(jsonlite)
library(openxlsx)
library(gdata)
library(lubridate)
library(qdap)
library(tm)
library(tidytext)
library(quanteda)
library(stm)
library(readr)
library(data.table)
library(readtext)
library(tidyverse)
library(texreg)
library(strucchange)
```

We now create a subdirectory in case it doesn't exist. This subdirectory holds the downloaded texts.

```{r}
if(!dir.exists("CMS Documents")){
  dir.create("CMS Documents")
}
```

Download the data from the Federal Register website. Prior to this I examined the URL format used by the search function at http://www.federalregister.gov and noted how it was formatted for requesting documents from both the HCFA and the CMS for each year from 1994 to 2017. Year-specific searches are conducted because federalregister.gov limits mass downloads to 1000 documents each.

This first chunk of code downloads the URLs of the text documents and saves them to year-specific CSV files.

```{r, results="hide"}
cmsyears <- c(1994:2017)
for(i in cmsyears){
  download.file(paste("https://www.federalregister.gov/documents/search.csv?conditions%5Bagencies%5D%5B%5D=centers-for-medicare-medicaid-services&conditions%5Bpublication_date%5D%5Byear%5D=",i,sep=""),
                destfile = paste("CMS Documents/cms",i,".csv",sep=""))
  download.file(paste("https://www.federalregister.gov/documents/search.csv?conditions%5Bagencies%5D%5B%5D=health-care-finance-administration&conditions%5Bpublication_date%5D%5Byear%5D=",i,sep=""),
                destfile = paste("CMS Documents/hcfa",i,".csv",sep=""))
}

cmsinfo <- NULL
hcfainfo <- NULL
for(i in cmsyears){
  cmsinfo <- rbind(cmsinfo, read.csv(paste("CMS Documents/cms",i,".csv",sep="")))
  hcfainfo <- rbind(hcfainfo, read.csv(paste("CMS Documents/hcfa",i,".csv",sep="")))
}

combined <- rbind(data.frame(cmsinfo, agency = "CMS"),
                  data.frame(hcfainfo, agency = "HCFA"))
```

This next chunk runs through the list of document URLs in the saved CSV files and pulls their titles, text, URLs, and other relevant metadata. Note that the "badmatches" object is created to account for some irregularities in how the Federal Register saves documents; these documents are read and saved using an alternative method.

```{r}
combined$texturl <- NA
combined$text <- NA
combined$rawtext <- NA

`%not in%` <- function (x, table) is.na(match(x, table, nomatch=NA_integer_))

for(i in 1:dim(combined)[1]){
  foo <- as.character(combined$html_url[i])
  combined$texturl[i] <- gsub(paste(trim(as.character(combined$document_number[i])),
                                    ".*",sep=""), 
                              paste(trim(as.character(combined$document_number[i])),
                                    ".txt",sep=""), 
                              gsub("documents/","documents/full_text/text/",foo))
  footext <- tryCatch(readtext(combined$texturl[i])$text,
                      error = function(e){paste("something wrong here")})
  combined$rawtext[i] <- footext
  footext <- gsub("\n"," ",footext)
  footext <- gsub("\\[[^\\]]*\\]", " ", footext, perl=TRUE)
  footext <- bracketX(footext, "angle")
  footext <- gsub("[|]|]", " ", footext)
  footext <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", " ", footext)
  footext <- gsub(".*SUMMARY:"," ",footext)
  footext <- gsub(".*AGENCY:"," ",footext)
  combined$text[i] <- footext
}


badmatches <- which(combined$text == "something wrong here")

for(i in badmatches){
  foo <- as.character(combined$html_url[i])
  foodate <- as.Date(as.character(combined$publication_date[i]), format = "%m/%d/%Y")
  fooyear <- year(foodate)
  foomonth <- sprintf("%02d", month(foodate))
  fooday <- sprintf("%02d", day(foodate))
  foodoc <- trim(as.character(combined$document_number[i]))
  combined$texturl[i] <- paste("https://www.gpo.gov/fdsys/pkg/FR-",
                               fooyear,"-",foomonth,"-",fooday,
                               "/html/",foodoc,".htm",sep="")
  footext <- read_file(combined$texturl[i])
  combined$rawtext[i] <- footext
  footext <- gsub("\n"," ",footext)
  footext <- gsub("\\[[^\\]]*\\]", " ", footext, perl=TRUE)
  footext <- bracketX(footext, "angle")
  footext <- gsub("[|]|]", " ", footext)
  footext <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", " ", footext)
  footext <- gsub(".*SUMMARY:"," ",footext)
  footext <- gsub(".*AGENCY:"," ",footext)
  combined$text[i] <- footext
}
```

Format the date variable as a "Date" object for later analysis, and create a numeric version for easier inclusion into the structural topic models.

```{r}
combined$publication_date <- as.Date(combined$publication_date,
                                     format = "%m/%d/%Y")

combined$numeric_date <- as.numeric(combined$publication_date)
```

Label the different types of documents.

```{r}
combined$type <- factor(combined$type, 
                        levels = c("Notice", "Correction", 
                                   "Proposed Rule", "Rule", 
                                   "Uncategorized Document"))
```

We now include some outside covariates, including a price index for calculating spending in real dollars, the amount of federal spending on Medicaid and Medicare, and a list of CMS/HCFA directors and their tenures. Note that the "cms-directors.csv" file also includes information on CFScores for political principals, as well as other political variables (e.g., partisan control of Congress).

```{r}
inflation <- read.csv("inflation.csv")
spending <- read.csv("spending.csv")
spending <- merge(inflation, spending)
spending$medicare_fed_real <- spending$medicare_fed/spending$index2000
spending$medicaid_fed_real <- spending$medicaid_fed/spending$index2000
comminfo <-  read.csv("cms-directors.csv")
```

Format the CMA/HCFA tenure dates for analysis.

```{r}
comminfo$end <- as.Date(comminfo$end, format = "%m/%d/%y")
comminfo$end[is.na(comminfo$end)] <- "2017-12-31"
comminfo$start <- as.Date(comminfo$start, format = "%m/%d/%y")
```

We can now match the documents to the administrators and other political variables that were relevant at the time of publication.

```{r}
commids <- NULL
for(i in unique(comminfo$id)){
  commids <- rbind(commids, 
                   setDT(comminfo)[i,][, list(publication_date=seq(start, end, by = 'day')), id])
}

combined$year <- year(combined$publication_date)

combined <- merge(combined, spending)
commmerged <- merge(commids, comminfo)

combined <- merge(combined, commmerged)

combined$presagencydiff <- abs(combined$cfscore - combined$pres_cfscore)

combined$senpolarization <- abs(combined$sen_reps_cfscore - 
                                combined$sen_dems_cfscore)

combined$housepolarization <- abs(combined$house_reps_cfscore - 
                                  combined$house_dems_cfscore)

combined$presfildiff <- abs(combined$pres_cfscore - 
                            combined$fil_cfscore)

combined$pressendiff <- abs(combined$pres_cfscore - 
                            combined$sen_median_cfscore)

combined$preshousediff <- abs(combined$pres_cfscore - 
                              combined$house_median_cfscore)

combined$prescongdiff <- apply(subset(combined, select = c(pressendiff,
                                                           preshousediff,
                                                           presfildiff)),
                               1, max)

combined$agencyfildiff <- abs(combined$cfscore - 
                              combined$fil_cfscore)

combined$agencysendiff <- abs(combined$cfscore - 
                              combined$sen_median_cfscore)

combined$agencyhousediff <- abs(combined$cfscore - 
                                combined$house_median_cfscore)

combined$agencycongdiff <- apply(subset(combined, select = c(agencysendiff,
                                                             agencyhousediff,
                                                             agencyfildiff)),
                                 1, max)


combined$partisan <- apply(subset(combined,
                                  select = c(pres_rep,
                                             house_rep,
                                             sen_rep)),
                           1, sum)

combined$unified <- as.numeric(combined$partisan %in% c(0,3))

combined$relprescongdist <- combined$presagencydiff - combined$agencycongdiff

combined$z_relprescongdist <- as.numeric(scale(combined$relprescongdist))
combined$z_prescongdiff <- as.numeric(scale(combined$prescongdiff))
combined$z_cfscore <- as.numeric(scale(combined$cfscore))

combined.trim <- subset(combined, !is.na(cfscore) & 
                                  !is.na(pres_cfscore) & 
                                  congress < 115)

combined.trim$type <- factor(combined.trim$type)
```

# Textual Preprocessing

We now turn to the preprocessing of the textual data. First, we run the textProcessor() command. This stems the text, removes stopwords and agency names, removes (most) HTML markup and other punctuation, removes numbers, and stems the words. Then we run the prepDocuments() command, which takes the output from textProcessor() and creates document-term matrices.

```{r}
combined.processed <- textProcessor(combined.trim$text, 
                                    customstopwords = c("hcfa", "cms",
                                                        "centers for medicare and medicaid services",
                                                        "centers for medicare & medicaid services",
                                                        "centers for medicaid and medicare services",
                                                        "centers for medicaid & medicare services",
                                                        "centers for medicare and medicaid service",
                                                        "centers for medicare & medicaid service",
                                                        "centers for medicaid and medicare service",
                                                        "centers for medicaid & medicare service",
                                                        "center for medicare and medicaid services",
                                                        "center for medicare & medicaid services",
                                                        "center for medicaid and medicare services",
                                                        "center for medicaid & medicare services",
                                                        "center for medicare and medicaid service",
                                                        "center for medicare & medicaid service",
                                                        "center for medicaid and medicare service",
                                                        "center for medicaid & medicare service",
                                                        "health care finance administration",
                                                        "health care financing administration"),
                                    metadata = combined.trim)  # need to correct some of the things above


combined.out <- prepDocuments(combined.processed$documents, combined.processed$vocab, 
                              combined.processed$meta, 
                              lower.thresh = floor(length(combined.processed$documents)*.01),
                              upper.thresh = ceiling(length(combined.processed$documents)*0.99))
```

## Estimation of the STM

We now proceed to model estimation.

# Searching for Topics

We first use the searchK() function to estimate models across a wide range of topics. We estimate models with as few as two topics and as many as forty.


```{r}
set.seed(1384)
combined.sK40 <- searchK(documents = combined.out$documents,
                        vocab = combined.out$vocab,
                        K = 2:40, # to calculate the "optimal" nubmer of topics
                        prevalence =  ~ s(numeric_date) + 
                          factor(type) + factor(agency) + 
                          unified * (z_relprescongdist + z_prescongdiff) + 
                          z_cfscore + acting + 
                          log(medicaid_fed_real) + log(medicare_fed_real),
                        init.type = "Spectral",
                        heldout.seed = 1384,
                        max.em.its = 1000, 
                        data = combined.out$meta)
```

This chunk of code manipulates the output of the searchK() command into a format suitable for plotting.

```{r}
fit.indices <- data.frame(K = rep(combined.sK40$results$K, 4),
                          fit = c(combined.sK40$results$exclus,
                                  combined.sK40$results$semcoh,
                                  combined.sK40$results$heldout,
                                  combined.sK40$results$lbound),
                          stat = c(rep("Exclusivity", 
                                       length(combined.sK40$results$exclus)),
                                   rep("Semantic Coherence", 
                                       length(combined.sK40$results$semcoh)),
                                   rep("Held-Out Log-Likelihood", 
                                       length(combined.sK40$results$heldout)),
                                   rep("Lower Bound of Log-Likelihood", 
                                       length(combined.sK40$results$semcoh))))

fit.mfx <- data.frame(K = rep(combined.sK40$results$K[-1], 4),
                      fit = c(diff(combined.sK40$results$exclus)/
                                combined.sK40$results$exclus[-length(combined.sK40$results$K)],
                              diff(combined.sK40$results$semcoh)/
                                combined.sK40$results$semcoh[-length(combined.sK40$results$K)],
                              diff(combined.sK40$results$heldout)/
                                combined.sK40$results$heldout[-length(combined.sK40$results$K)],
                              diff(combined.sK40$results$lbound)/
                                combined.sK40$results$lbound[-length(combined.sK40$results$K)]),
                      stat = c(rep("Exclusivity", 
                                   length(combined.sK40$results$exclus) - 1),
                               rep("Semantic Coherence", 
                                   length(combined.sK40$results$semcoh) - 1),
                               rep("Held-Out Log-Likelihood", 
                                   length(combined.sK40$results$heldout) - 1),
                               rep("Lower Bound of Log-Likelihood", 
                                   length(combined.sK40$results$semcoh) - 1)))
```

We now create the plots. First, the raw statistics.

```{r}
ggplot(fit.indices, aes(x = K, y = fit)) + 
  geom_point(alpha = I(0.3)) + 
  facet_wrap(~stat, scales = "free_y") + 
  stat_smooth(level = 0.9, color = "black") + 
  theme_bw() + 
  ylab('Fit Statistic\n') + 
  xlab('\nNumber of Topics') +
  xlim(0,40)
```

And now the marginal changes.

```{r}
ggplot(fit.mfx, aes(x = K, y = fit)) + 
  geom_point(alpha = I(0.3)) + 
  facet_wrap(~stat, scales = "free_y") + 
  stat_smooth(level = 0.9, color = "black") + 
  theme_bw() + 
  ylab('Marginal Percentage\nChange in Fit Statistic\n') + 
  xlab('\nNumber of Topics') + 
  xlim(0,40) + 
  geom_hline(yintercept = 0, linetype = "dotted")
```


```{r, echo = FALSE, message = FALSE, warning = FALSE, error = FALSE, include = FALSE}
fit.plot.1 <- ggplot(fit.indices, aes(x = K, y = fit)) + 
  geom_point(alpha = I(0.3)) + 
  facet_wrap(~stat, scales = "free_y") + 
  stat_smooth(level = 0.9, color = "black") + 
  theme_bw() + 
  ylab('Fit Statistic\n') + 
  xlab('\nNumber of Topics') +
  xlim(0,40)

ggsave(fit.plot.1, file = "fit-plot-1.pdf", height = 3, width = 6)

fit.plot.2 <- ggplot(fit.mfx, aes(x = K, y = fit)) + 
  geom_point(alpha = I(0.3)) + 
  facet_wrap(~stat, scales = "free_y") + 
  stat_smooth(level = 0.9, color = "black") + 
  theme_bw() + 
  ylab('Marginal Percentage\nChange in Fit Statistic\n') + 
  xlab('\nNumber of Topics') + 
  xlim(0,40) + 
  geom_hline(yintercept = 0, linetype = "dotted")

ggsave(fit.plot.2, file = "fit-plot-2.pdf", height = 3, width = 6)
```

# Estimating the Model

The diagnostic results, along with examination of the topics, suggest that a model with twenty-eight topics might be optimal. We now estimate the model. We include as covariates time, document type, agency (CMS or HCFA), partisan/ideological variables, whether the administrator is serving in an acting capacity, and logged real spending. We further use the "Spectral" option as a starting value since it is both reproducible and previous research (Arora et al. 2013; Roberts et al. 2016) suggests it is globally consistent under reasonable assumptions.

```{r}
combined.stm28 <- stm(documents = combined.out$documents,
                     vocab = combined.out$vocab,
                     K = 28, # to calculate the "optimal" nubmer of topics
                     prevalence =  ~ s(numeric_date) + 
                       factor(type) + factor(agency) + 
                       unified * (z_relprescongdist + z_prescongdiff) + 
                       z_cfscore + acting + 
                       log(medicaid_fed_real) + log(medicare_fed_real),
                     init.type = "Spectral",
                     seed = 1384,
                     max.em.its = 1000, 
                     data = combined.out$meta)
```

```{r, echo = FALSE, message = FALSE, warning = FALSE, error = FALSE, include = FALSE}
saveRDS(combined.stm28, file = "combined-stm28.Rda") 
```

We now estimate regression models based on the method of composition. This can be done using the estimateEffect() command in the stm package, and accounts for the inherent uncertainty in the estimated topic proportions. The "Local" option indicates that the document-specific covariance matrices should be used. This is somewhat slower than the default "Global" option (which uses the "average" covariance matrix) but better accounts for the fact that different topics might have vastly different covariance matrices associated with them.

```{r}
set.seed(1384)
combined.est28 <-  estimateEffect(1:28  ~ s(numeric_date) + 
                                    factor(type) + factor(agency) + 
                                    unified * (z_relprescongdist + z_prescongdiff) + 
                                    z_cfscore + acting + 
                                    log(medicaid_fed_real) + log(medicare_fed_real), 
                                  stmobj = combined.stm28, 
                                  metadata = combined.out$meta, 
                                  uncertainty = "Local",
                                  nsims = 100,
                                  documents = combined.out$documents)

set.seed(1384)
allmods.28 <- summary(combined.est28)
```

```{r, echo = FALSE, message = FALSE, warning = FALSE, error = FALSE, include = FALSE}
write.csv(allmods.28$prob, file = "words-prob-28.csv")
write.csv(allmods.28$frex, file = "words-frex-28.csv")
write.csv(do.call(rbind.data.frame,findThoughts(combined.stm28, 
                                                texts = as.character(combined.out$meta$title), 
                                                n = 10)$docs), file = "docs-28.csv")
```

## Interpretation of the STM

# Relevant Words

For each topic, show the three words with the highest topic-level frequency.

```{r}
summary(combined.stm28)$prob[,1:3]
```

And do the same for topic-level exclusivity.

```{r}
summary(combined.stm28)$frex[,1:3]
```

After examining the words and most closely associated documents for each topic, give the topics qualitatively informative names.

```{r}
topics <- c("States & Counties",
            "Quarterly\nListings",
            "Skilled Nursing and\nHome Health",
            "Corrections",
            "Disproportionate\nShare",
            "Early Information\nCollection",
            "Accreditation\nand Approval",
            "Kidney Dialysis\nand ESRD",
            "Private Sector\nInsurance",
            "Appeals",
            "Demonstration\nProjects",
            "EDCA Committee",
            "Reimbursements",
            "(Proposed) Rules",
            "Later Information\nCollection",
            "Other Meetings",
            "Organ Procurement",
            "Privacy Act\nof 1974",
            "SCHIP",
            "DMEs and EHRs",
            "Surgical\nProcedures",
            "Hospital Payment\nSystems",
            "Prescription Drugs",
            "Statements of\nOrganization",
            "Conditions of\nParticipation",
            "Actuarial and\nPremium Rates",
            "Cities and\nMetro Areas",
            "Referrals")
```

# Figure 9

We now create figures to interpret the model. First, we create Figure 9, which looks at how the "Corrections" and "Rules and Proposed Rules" topics align with the same covariates in the model. First, we use the plot() function in the stm package to extract the estimates of interest, including relevant confidence intervals. Note that we set a seed twice because the estimates are based on simulation; in order to ensure the same standard errors each time we run the plot() function for the purpose of extracting confidence intervals, repeated setting of the seed is necessary given how the package operates.

```{r}
set.seed(1384)
type.preds.90 <- plot(combined.est28, covariate = "type", nsims = 1000, 
                      custom.labels = 1:28, ci.level = 0.9, 
                      omit.plot = TRUE)[]

set.seed(1384)
type.preds.95 <- plot(combined.est28, covariate = "type", nsims = 1000, 
                      custom.labels = 1:28, ci.level = 0.95, 
                      omit.plot = TRUE)[]
```

We now manipulate the package output for the purpose of inputting them into a plot object later.

```{r}
type.90.means <- data.frame(do.call(rbind.data.frame, 
                                    type.preds.90$means))
colnames(type.90.means) <- type.preds.90$uvals
type.90.means <- as.matrix(type.90.means)
type.90.lower.cis <- NULL
type.90.upper.cis <- NULL
for(i in combined.est28$topics){
  type.90.lower.cis <- rbind(type.90.lower.cis, 
                             type.preds.90$cis[[i]][1,])
  type.90.upper.cis <- rbind(type.90.upper.cis, 
                             type.preds.90$cis[[i]][2,])
}

colnames(type.90.lower.cis) <- type.preds.90$uvals
colnames(type.90.upper.cis) <- type.preds.90$uvals

type.90.means <- melt(type.90.means)
type.90.lower.cis <- melt(type.90.lower.cis)
type.90.upper.cis <- melt(type.90.upper.cis)

colnames(type.90.lower.cis)[3] <- "lower90"
colnames(type.90.upper.cis)[3] <- "upper90"

type.90 <- merge(merge(type.90.means, 
                       type.90.lower.cis), 
                 type.90.upper.cis)


type.95.means <- data.frame(do.call(rbind.data.frame, 
                                    type.preds.95$means))
colnames(type.95.means) <- type.preds.95$uvals
type.95.means <- as.matrix(type.95.means)
type.95.lower.cis <- NULL
type.95.upper.cis <- NULL
for(i in combined.est28$topics){
  type.95.lower.cis <- rbind(type.95.lower.cis, 
                             type.preds.95$cis[[i]][1,])
  type.95.upper.cis <- rbind(type.95.upper.cis, 
                             type.preds.95$cis[[i]][2,])
}

colnames(type.95.lower.cis) <- type.preds.95$uvals
colnames(type.95.upper.cis) <- type.preds.95$uvals

type.95.means <- melt(type.95.means)
type.95.lower.cis <- melt(type.95.lower.cis)
type.95.upper.cis <- melt(type.95.upper.cis)

colnames(type.95.lower.cis)[3] <- "lower95"
colnames(type.95.upper.cis)[3] <- "upper95"

type.95 <- merge(merge(type.95.means, 
                       type.95.lower.cis), 
                 type.95.upper.cis)

type.both <- merge(type.95, 
                   subset(type.90, 
                          select = -c(value)))
type.both$Var1 <- factor(type.both$Var1, 
                         labels = paste("Topic",
                                        levels(factor(type.both$Var1))))
type.both$Var2 <- factor(type.both$Var2, 
                         levels = c("Notice", "Rule", 
                                    "Proposed Rule", 
                                    "Correction", 
                                    "Uncategorized Document"))

type.both$Topic <- type.both$Var1
type.both$Topic <- factor(type.both$Topic, labels = topics)
```

We now plot how the document-type covariates are associated with the "Corrections" and "Rules and Proposed Rules" topics.

```{r}
ggplot(subset(type.both, Topic %in% c("Corrections", 
                                      "(Proposed) Rules") &
                                      Var2 != "Uncategorized Document"), 
                         aes(x = Var2, y = value, 
                             color = Topic, 
                             shape = Topic)) + 
  geom_point(position=position_dodge(width=0.75)) + 
  geom_errorbar(aes(ymin = lower95, ymax = upper95),
                width = 0,
                position=position_dodge(width=0.75)) +
  geom_errorbar(aes(ymin = lower90, ymax = upper90),
                width = 0, size = 1,
                position=position_dodge(width=0.75)) +
  geom_hline(yintercept = 0, linetype = "dotted") + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  scale_colour_grey(start = 0.2, end = 0.6) + 
  ylab('Predicted Proportion of\nDocument Assigned to Topic') + 
  xlab('\nDocument Type')
```

# Figure 10

We now examine the Yackee and Yackee (2009) findings. First, we estimate a simple t-test to show the estimated "Rules and Proposed Rules" topic proportions under unified versus divided government.

```{r}
t.test(combined.stm28$theta[,14][combined.out$meta$unified == 1], 
       combined.stm28$theta[,14][combined.out$meta$unified == 0])
```

We now proceed to Figure 10. First, we use the plot function estimate the predicted effects for unified and divided government under various ideological arrangements. We save the outputs as objects in order to coerce them into data frames later on.

```{r}
set.seed(1384)
min.unified.90 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = min(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 1]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
min.unified.95 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = min(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 1]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.95, omit.plot = T)

set.seed(1384)
mean.unified.90 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = mean(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 1]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
mean.unified.95 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = mean(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 1]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.95, omit.plot = T)

set.seed(1384)
max.unified.90 <- plot(combined.est28, covariate = "unified", 
                        nsims = 1000, 
                        moderator = "z_prescongdiff",
                        moderator.value = max(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 1]),
                        method = "difference", cov.value1 = 1, cov.value2 = 0,
                        topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
max.unified.95 <- plot(combined.est28, covariate = "unified", 
                        nsims = 1000, 
                        moderator = "z_prescongdiff",
                        moderator.value = max(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 1]),
                        method = "difference", cov.value1 = 1, cov.value2 = 0,
                        topic = 14, ci.level = 0.95, omit.plot = T)

set.seed(1384)
min.divided.90 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = min(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 0]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
min.divided.95 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = min(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 0]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.95, omit.plot = T)

set.seed(1384)
mean.divided.90 <- plot(combined.est28, covariate = "unified", 
                        nsims = 1000, 
                        moderator = "z_prescongdiff",
                        moderator.value = mean(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 0]),
                        method = "difference", cov.value1 = 1, cov.value2 = 0,
                        topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
mean.divided.95 <- plot(combined.est28, covariate = "unified", 
                        nsims = 1000, 
                        moderator = "z_prescongdiff",
                        moderator.value = mean(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 0]),
                        method = "difference", cov.value1 = 1, cov.value2 = 0,
                        topic = 14, ci.level = 0.95, omit.plot = T)

set.seed(1384)
max.divided.90 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = max(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 0]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
max.divided.95 <- plot(combined.est28, covariate = "unified", 
                       nsims = 1000, 
                       moderator = "z_prescongdiff",
                       moderator.value = max(combined.out$meta$z_prescongdiff[combined.out$meta$unified == 0]),
                       method = "difference", cov.value1 = 1, cov.value2 = 0,
                       topic = 14, ci.level = 0.95, omit.plot = T)

set.seed(1384)
mean.overall.90 <- plot(combined.est28, covariate = "unified", 
                        nsims = 1000, 
                        moderator = "z_prescongdiff",
                        moderator.value = mean(combined.out$meta$z_prescongdiff),
                        method = "difference", cov.value1 = 1, cov.value2 = 0,
                        topic = 14, ci.level = 0.9, omit.plot = T)

set.seed(1384)
mean.overall.95 <- plot(combined.est28, covariate = "unified", 
                        nsims = 1000, 
                        moderator = "z_prescongdiff",
                        moderator.value = mean(combined.out$meta$z_prescongdiff),
                        method = "difference", cov.value1 = 1, cov.value2 = 0,
                        topic = 14, ci.level = 0.95, omit.plot = T)
```

We now coerce the contents of the plot objects into a data frame for the purpose of feeding it into a ggplot call.

```{r}
topic14.frame <- rbind(data.frame(lower95 = min.unified.95$cis[[1]][1],
                                  lower90 = min.unified.90$cis[[1]][1],
                                  pred = min.unified.90$means[[1]],
                                  upper90 = min.unified.90$cis[[1]][2],
                                  upper95 = min.unified.95$cis[[1]][2],
                                  div = "Minimum Under\nUnified Government",
                                  hull = 0),
                       data.frame(lower95 = mean.unified.95$cis[[1]][1],
                                  lower90 = mean.unified.90$cis[[1]][1],
                                  pred = mean.unified.90$means[[1]],
                                  upper90 = mean.unified.90$cis[[1]][2],
                                  upper95 = mean.unified.95$cis[[1]][2],
                                  div = "Mean Under\nUnified Government",
                                  hull = 0),
                       data.frame(lower95 = max.unified.95$cis[[1]][1],
                                  lower90 = max.unified.90$cis[[1]][1],
                                  pred = max.unified.90$means[[1]],
                                  upper90 = max.unified.90$cis[[1]][2],
                                  upper95 = max.unified.95$cis[[1]][2],
                                  div = "Maximum Under\nUnified Government",
                                  hull = 0),
                       data.frame(lower95 = min.divided.95$cis[[1]][1],
                                  lower90 = min.divided.90$cis[[1]][1],
                                  pred = min.divided.90$means[[1]],
                                  upper90 = min.divided.90$cis[[1]][2],
                                  upper95 = min.divided.95$cis[[1]][2],
                                  div = "Minimum Under\nDivided Government",
                                  hull = 0),
                       data.frame(lower95 = mean.divided.95$cis[[1]][1],
                                  lower90 = mean.divided.90$cis[[1]][1],
                                  pred = mean.divided.90$means[[1]],
                                  upper90 = mean.divided.90$cis[[1]][2],
                                  upper95 = mean.divided.95$cis[[1]][2],
                                  div = "Mean Under\nDivided Government",
                                  hull = 1),
                       data.frame(lower95 = max.divided.95$cis[[1]][1],
                                  lower90 = max.divided.90$cis[[1]][1],
                                  pred = max.divided.90$means[[1]],
                                  upper90 = max.divided.90$cis[[1]][2],
                                  upper95 = max.divided.95$cis[[1]][2],
                                  div = "Maximum Under\nDivided Government",
                                  hull = 1),
                       data.frame(lower95 = mean.overall.95$cis[[1]][1],
                                  lower90 = mean.overall.90$cis[[1]][1],
                                  pred = mean.overall.90$means[[1]],
                                  upper90 = mean.overall.90$cis[[1]][2],
                                  upper95 = mean.overall.95$cis[[1]][2],
                                  div = "Overall Mean",
                                  hull = 1))
 
topic14.frame$div <- factor(topic14.frame$div, 
                            levels = levels(topic14.frame$div)[rev(order(topic14.frame$pred))])

topic14.frame$hull <- factor(topic14.frame$hull,
                             labels = c("Yes", "No"))
```

Plot the estimated probabilities for each partisan/ideological combination.

```{r}
ggplot(topic14.frame, aes(x = div, y = pred, 
                          color = factor(hull), 
                          shape = factor(hull))) + 
  geom_point() + 
  geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0) + 
  geom_errorbar(aes(ymin = lower90, ymax = upper90), size = 1, width = 0) +
  theme_bw() + 
  geom_hline(yintercept = 0, linetype = "dotted") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  ylab('Effect of Unified Government on\n(Proposed) Rule Topic Proportions\n') + 
  xlab('\nLevel of President-Congress Divergence') + 
  scale_colour_grey(start = 0.2, end = 0.6, 
                    name = "President-Congress\nDivergence Level\nObserved Under\nUnified Government?") +
  scale_shape_discrete(name = "President-Congress\nDivergence Level\nObserved Under\nUnified Government?")
```

# Figure 11

We now examine how usage of the "Prescription Drug" topic varies over time. First, we set up a date range for the x-axis. We create a vector of 100 points that is defined by the minimum and maximumm dates in the dataset. This is because the stm package's built-in plotting function for continuous variables uses a default of 100 points to predict expected topic usage. This date vector is combined with the stm package's plot output and the combined data frame is used to create a ggplot object.

```{r}
daterange <- seq(min(combined.out$meta$publication_date),
                 max(combined.out$meta$publication_date),
                 length.out = 100)

set.seed(1384)
plot.rx.95 <- plot(combined.est28, covariate = "numeric_date", topics  = 23, 
                   printlegend = FALSE, method = "continuous",
                   omit.plot = TRUE)[]

set.seed(1384)
plot.rx.90 <- plot(combined.est28, covariate = "numeric_date", topics  = 23, 
                   printlegend = FALSE, method = "continuous",
                   ci.level = .90, omit.plot = TRUE)[]

rx.frame <- data.frame(Date = daterange,
                       pred = plot.rx.90$means[[1]],
                       lower95 = plot.rx.95$ci[[1]][1,],
                       lower90 = plot.rx.90$ci[[1]][1,],
                       upper90 = plot.rx.90$ci[[1]][2,],
                       upper95 = plot.rx.95$ci[[1]][2,])
```

Here, we estimate the breakpoints and save them.

```{r}
rx.changepoints <- breakpoints(combined.stm28$theta[,23]~1, breaks = 10)
```

We now combine everything into a plot.

```{r}
ggplot(rx.frame, aes(x = Date, y = pred)) + 
                geom_ribbon(aes(ymin = lower90, ymax = upper90, color = NULL, alpha = I(0.3))) + 
                geom_ribbon(aes(ymin = lower95, ymax = upper95, color = NULL, alpha = I(0.3))) + 
                geom_line(aes(x = Date, y = pred)) + 
                theme_bw() + 
                geom_vline(xintercept = combined.out$meta$publication_date[rx.changepoints$breakpoints[1]], 
                           linetype = "dashed") + 
                geom_vline(xintercept = combined.out$meta$publication_date[rx.changepoints$breakpoints[2]],
                           linetype = "dashed") + 
                geom_vline(xintercept = as.Date("2003-12-08"), linetype = "dashed") + 
                geom_vline(xintercept = as.Date("2012-06-28"), linetype = "dashed") + 
                ylab('Expected Proportion of Document\nAssigned to Prescription Drug Topic\n') + 
                xlab('\nDate') +     
                annotate("text", x = as.Date("2000-09-01"), y = -0.05, 
                         label = "Signing of the MMA\n(December 8, 2003)") +
                annotate("segment", x = as.Date("2000-09-01"), y = -0.06, 
                         xend = as.Date("2003-12-01"), yend = -0.075,
                         arrow = arrow(angle = 45, length = unit(2, "mm"), type = "closed")) + 
                annotate("text", x = as.Date("2007-02-01"), y = -0.05, 
                         label = "Estimated Breakpoint\n(December 15, 2003)") +
                annotate("segment", x = as.Date("2007-02-01"), y = -0.06, 
                         xend = as.Date("2003-12-22"), yend = -0.075,
                         arrow = arrow(angle = 45, length = unit(2, "mm"), type = "closed")) +
                annotate("text", x = as.Date("2014-12-31"), y = 0.085, 
                         label = "NFIB v. Sebelius\n(June 28, 2012)") +
                annotate("segment", x = as.Date("2014-12-31"), y = 0.075, 
                         xend = as.Date("2012-07-04"), yend = 0.06,
                         arrow = arrow(angle = 45, length = unit(2, "mm"), type = "closed")) + 
                annotate("text", x = as.Date("2009-03-01"), y = 0.085, 
                         label = "Estimated Breakpoint\n(April 18, 2012)") +
                annotate("segment", x = as.Date("2009-03-01"), y = 0.075, 
                         xend = as.Date("2012-04-11"), yend = 0.06,
                         arrow = arrow(angle = 45, length = unit(2, "mm"), type = "closed"))
```

# Figure 12

This figure creation chunk and the next one follow the same general format as before; we use the stm package's built-in plotting function to estimate effects and confidence intervals, coerce the plot output into a data frame, and use that data frame in a ggplot call.


```{r}
set.seed(1384)
cfscore.plot.90 <- plot(combined.est28, covariate = "z_cfscore", 
                        cov.value1 = max(combined.out$meta$z_cfscore), 
                        cov.value2 = min(combined.out$meta$z_cfscore), 
                        nsims = 1000, 
                        method = "difference", ci.level = (1-0.1)^(1/28), omit.plot = TRUE)[]

set.seed(1384)
cfscore.plot.95 <- plot(combined.est28, covariate = "z_cfscore", 
                        cov.value1 = max(combined.out$meta$z_cfscore), 
                        cov.value2 = min(combined.out$meta$z_cfscore), 
                        nsims = 1000, 
                        method = "difference", ci.level = (1-0.05)^(1/28), omit.plot = TRUE)[]

cfscore.frame <- data.frame(Topic = 1:28,
                            lower95 = sapply(cfscore.plot.95$cis, "[[", 1),
                            lower90 = sapply(cfscore.plot.90$cis, "[[", 1),
                            pred = unlist(cfscore.plot.90$means),
                            upper90 = sapply(cfscore.plot.90$cis, "[[", 2),
                            upper95 = sapply(cfscore.plot.95$cis, "[[", 2))


cfscore.frame$Topic <- factor(cfscore.frame$Topic, labels = topics)
cfscore.frame$Topic2 <- factor(gsub("\n", " ", as.character(cfscore.frame$Topic)), 
                               levels = gsub("\n", " ", 
                                             as.character(cfscore.frame$Topic)))


cfscore.ggplot <- ggplot(cfscore.frame, 
                         aes(x = fct_rev(Topic2), 
                             y = pred)) +
  geom_point() + 
  geom_errorbar(aes(ymin = lower90, ymax = upper90), 
                size = 1, width = 0) + 
  geom_errorbar(aes(ymin = lower95, ymax = upper95), 
                width = 0) + 
  geom_hline(yintercept = 0, 
             linetype = "dotted") + 
  coord_flip() + 
  theme_bw() + 
  xlab('Topic\n') + 
  ylab('\nPredicted Effect of Administrator\nConservatism on Topic Proportion')
```

# Figure 13


```{r}
set.seed(1384)
reldist.unified.plot.90 <- plot(combined.est28, covariate = "z_relprescongdist", 
                           cov.value1 = max(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 1]), 
                           cov.value2 = min(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 1]), 
                           nsims = 1000, 
                           moderator = "unified",
                           moderator.value = 1,
                           method = "difference", ci.level = (1-0.1)^(1/28), omit.plot = TRUE)[]

set.seed(1384)
reldist.divided.plot.90 <- plot(combined.est28, covariate = "z_relprescongdist", 
                                cov.value1 = max(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 0]), 
                                cov.value2 = min(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 0]), 
                                nsims = 1000, 
                                moderator = "unified",
                                moderator.value = 0,
                                method = "difference", ci.level = (1-0.1)^(1/28), omit.plot = TRUE)[]

set.seed(1384)
reldist.unified.plot.95 <- plot(combined.est28, covariate = "z_relprescongdist", 
                                cov.value1 = max(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 1]), 
                                cov.value2 = min(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 1]), 
                                nsims = 1000, 
                                moderator = "unified",
                                moderator.value = 1,
                                method = "difference", ci.level = (1-0.05)^(1/28), omit.plot = TRUE)[]

set.seed(1384)
reldist.divided.plot.95 <- plot(combined.est28, covariate = "z_relprescongdist", 
                                cov.value1 = max(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 0]), 
                                cov.value2 = min(combined.out$meta$z_relprescongdist[combined.out$meta$unified == 0]), 
                                nsims = 1000, 
                                moderator = "unified",
                                moderator.value = 0,
                                method = "difference", ci.level = (1-0.05)^(1/28), omit.plot = TRUE)[]


reldist.frame <- data.frame(Topic = 1:28,
                            lower95 = c(sapply(reldist.unified.plot.95$cis, "[[", 1),
                                        sapply(reldist.divided.plot.95$cis, "[[", 1)),
                            lower90 = c(sapply(reldist.unified.plot.90$cis, "[[", 1),
                                        sapply(reldist.divided.plot.90$cis, "[[", 1)),
                            pred = c(unlist(reldist.unified.plot.90$means),
                                     unlist(reldist.divided.plot.90$means)),
                            upper90 = c(sapply(reldist.unified.plot.90$cis, "[[", 2),
                                        sapply(reldist.divided.plot.90$cis, "[[", 2)),
                            upper95 = c(sapply(reldist.unified.plot.95$cis, "[[", 2),
                                        sapply(reldist.divided.plot.95$cis, "[[", 2)),
                            govtype = c(rep("Unified", length(reldist.unified.plot.95$topics)),
                                        rep("Divided", length(reldist.unified.plot.95$topics))))


reldist.frame$Topic <- factor(reldist.frame$Topic, labels = c("States & Counties",
                                                              "Quarterly Listings",
                                                              "Skilled Nursing and Home Health",
                                                              "Corrections",
                                                              "Disproportionate Share",
                                                              "Early Information Collection",
                                                              "Accreditation and Approval",
                                                              "Kidney Dialysis and ESRD",
                                                              "Private Sector Insurance",
                                                              "Appeals",
                                                              "Demonstration Projects",
                                                              "EDCA Committee",
                                                              "Reimbursements",
                                                              "(Proposed) Rules",
                                                              "Later Information Collection",
                                                              "Other Meetings",
                                                              "Organ Procurement",
                                                              "Privacy Act of 1974",
                                                              "SCHIP",
                                                              "DMEs and EHRs",
                                                              "Surgical Procedures",
                                                              "Hospital Payment Systems",
                                                              "Prescription Drugs",
                                                              "Statements of Organization",
                                                              "Conditions of Participation",
                                                              "Actuarial and Premium Rates",
                                                              "Cities and Metro Areas",
                                                              "Referrals"))

reldist.ggplot <- ggplot(reldist.frame, 
                          aes(x = fct_rev(Topic), 
                              y = pred, color = govtype)) +
                     geom_point(position=position_dodge(width=-0.75)) + 
                     geom_errorbar(aes(ymin = lower90, ymax = upper90), 
                                   size = 1, width = 0, position=position_dodge(width=-0.75)) + 
                     geom_errorbar(aes(ymin = lower95, ymax = upper95), 
                                   width = 0, position=position_dodge(width=-0.75)) + 
                     geom_hline(yintercept = 0, 
                                linetype = "dotted") + 
                     coord_flip() + 
                     theme_bw() +   
                     scale_colour_grey(start = 0.2, end = 0.6, 
                                       name = "Government Type") + 
                     scale_shape_discrete(name = "Government Type") + 
                     xlab('Topic\n') + 
                     ylab('\nPredicted Effect of Relative Administrator\nDistance from President on Topic Proportion') +    
                     geom_vline(xintercept = seq(2,28) - 0.5, linetype = "solid", 
                                size = 0.15, alpha = I(0.5)) + 
                     theme(axis.text.x = element_text(angle = 45, hjust = 1),
                           panel.grid.major.y = element_blank()) 
```


## Regression Tables (Appendix)

These set up the regression tables for the Appendix. One HTML file for each topic is created. The models are first estimated using simple linear regression in order to get objects that can be easily passed to the texreg() function, and then the relevant fit statistics from the estimateEffect() output are used to override.

```{r}
coef.names <- c("Constant",
                "B-Spline 1",
                "B-Spline 2",
                "B-Spline 3",
                "B-Spline 4",
                "B-Spline 5",
                "B-Spline 6",
                "B-Spline 7",
                "B-Spline 8",
                "B-Spline 9",
                "B-Spline 10",
                "Correction",
                "Proposed Rule",
                "Rule",
                "Uncategorized Document",
                "HCFA",
                "Unified Government",
                "Relative Distance from the President",
                "President-Congress Divergence",
                "Administrator Ideology",
                "Acting Administrator",
                "Ln(Federal Medicaid Spending)",
                "Ln(Federal Medicare Spending)",
                "Unified Government * Relative President-Senate Divergence",
                "Unified Government * President-Congress Divergence")

for(i in 1:28){
assign(paste("mod.", i, sep=""),
       lm(combined.stm28$theta[,i] ~ stm::s(numeric_date) + 
               factor(type) + factor(agency) + 
               unified * (z_relprescongdist + z_prescongdiff) + 
               z_cfscore + acting + 
               log(medicaid_fed) + log(medicare_fed), 
             data = combined.out$meta))
}

for(i in 1:28){
htmlreg(file = paste("Regression",i,".html", sep = ""),
        eval(parse(text = paste("mod.", i, sep=""))), 
        stars = c(0.1, 0.05, 0.01), digits = 3,
          custom.coef.names = coef.names,
          override.coef = allmods.28$tables[[i]][,1],
          override.se =  allmods.28$tables[[i]][,2],
          override.pvalues =  allmods.28$tables[[i]][,4],
          omit.coef = "(Spline)")
}
```
